# Required Packages
import pandas as pd
import numpy as np
import json
import folium
import seaborn as sns
import matplotlib.pyplot as plt
# Global matplotlib styling: ggplot look with larger axis/tick fonts.
plt.style.use('ggplot')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Silence library warnings (e.g. pandas deprecation notices) for cleaner output.
warnings.filterwarnings("ignore")
In this article, we use a dataset that contains Environmental Health Violations for Restaurants and Markets in Los Angeles County. This dataset can be found here and here.
Los Angeles County Environmental Health is responsible for checking food violations for all unincorporated areas and 85 of the 88 cities in the County. This dataset does not include Pasadena, Long Beach or Vernon (each has its own city health department).
The data below has already been pre-processed.
# Load the pre-processed datasets produced in the earlier cleaning step.
Violations_df = pd.read_csv('Data/Clean_Violations_df.csv')
Inspections_df = pd.read_csv('Data/Clean_Inspections_df.csv')
Data = pd.read_csv('Data/Data_Inspections_Violations_df.csv')
Violation_Dictionary_df = pd.read_csv('Data/Violation_Dictionary_df.csv')
Violations_matrix = pd.read_csv('Data/Violations_matrix_df.csv')

# Normalize ZIP codes to their 5-digit form (drops any ZIP+4 suffix).
Data['Facility Zip'] = Data['Facility Zip'].astype(str).str[:5]

# Per-ZIP aggregates: mean inspection score and number of distinct facilities.
# Named aggregation via dict unpacking lets us use the final column names
# (which contain spaces) directly, removing the original rename step;
# the string 'nunique' is the idiomatic spelling of pd.Series.nunique.
Facilities_Geographical_df = (
    Data.groupby('Facility Zip')
        .agg(**{'Average Score': ('Score', 'mean'),
                'Total Facilities': ('Facility ID', 'nunique')})
        .reset_index()
        .dropna()
)
# NOTE(review): Styler.hide_index() is deprecated in pandas >= 1.4 (removed in
# 2.0) in favor of Styler.hide(axis="index") — kept for compatibility here.
Facilities_Geographical_df.head().style.hide_index()
Next, we can export a GeoJSON file of LA County ZIP-code boundaries from here. However, we would like to reduce the size of this file to keep the computation efficient.
# Reduce the GeoJSON to only the ZIP-code polygons that appear in our data,
# shrinking the file folium has to load and render.
with open('Data/LA_ZIP_Codes.geojson', 'r') as jsonFile:
    Zipcode_Data = json.load(jsonFile)

# ZIP codes present in the aggregated data; a set gives O(1) membership tests.
zip_codes = set(Facilities_Geographical_df['Facility Zip'].unique())

# Keep only features whose ZIP code is in our dataset (iterate the features
# directly instead of indexing by range(len(...))).
kept_features = [feature for feature in Zipcode_Data['features']
                 if feature['properties']['name'] in zip_codes]

# Assemble the reduced FeatureCollection and save it. The `with` block
# guarantees the output file is closed — the original open(...).write(...)
# left the file handle dangling.
Reduced_Zipcode_Data = {'type': 'FeatureCollection', 'features': kept_features}
with open('Data/reduced_LA_ZIP_Codes.json', 'w') as outFile:
    json.dump(Reduced_Zipcode_Data, outFile,
              sort_keys=True, indent=4, separators=(',', ': '))

# Free the large intermediates before building maps.
del Zipcode_Data, zip_codes, kept_features, Reduced_Zipcode_Data
def plot_map(Inp_Column, Text_Legend = '', Inp_Df = Facilities_Geographical_df, Zoom_Level=8):
    """Render a folium choropleth of `Inp_Column` keyed on ZIP code.

    Parameters
    ----------
    Inp_Column : str
        Column of `Inp_Df` used to color the map.
    Text_Legend : str, optional
        Caption shown next to the color legend.
    Inp_Df : pandas.DataFrame, optional
        Frame with a 'Facility Zip' column plus `Inp_Column`; defaults to the
        module-level Facilities_Geographical_df (bound at definition time).
    Zoom_Level : int, optional
        Initial folium zoom level.

    Returns
    -------
    folium.Map
    """
    # Reduced GeoJSON created earlier in this script.
    Geographical_Data = r'Data/reduced_LA_ZIP_Codes.json'
    # Center the map on downtown Los Angeles.
    m = folium.Map(location=[34.052, -118.243], zoom_start=Zoom_Level)
    # Map.choropleth() was deprecated in folium 0.5 and removed in later
    # releases; folium.Choropleth(...).add_to(m) is the supported equivalent.
    folium.Choropleth(geo_data=Geographical_Data, fill_opacity=0.8, line_opacity=0.2,
                      data=Inp_Df, key_on='feature.properties.name',
                      columns=['Facility Zip', Inp_Column],
                      fill_color='RdYlGn', legend_name=Text_Legend).add_to(m)
    folium.LayerControl().add_to(m)
    # Show the map
    return m
# Choropleth of the mean inspection score per ZIP code.
plot_map('Average Score', 'The Average Score of Facilities')
# Choropleth of the number of distinct facilities per ZIP code.
plot_map('Total Facilities','Total Facilities')
First, let's add the Violation Matrix to our Data.
# Attach the per-facility violation indicator matrix to the inspection data.
# NOTE(review): how='right' keeps every Violations_matrix row and drops
# inspections without a matrix entry — confirm this is intended vs. 'left'/'inner'.
Data=pd.merge(Data, Violations_matrix, on='Facility ID', how='right')
Moreover, we previously found the following violation codes to be the most correlated ones.
'F037', 'F040', 'F007', 'F044', 'MF34', 'W007', 'W035', 'MF41', 'MF45', 'W017', 'W021', 'W034', 'MF31', 'W033', 'MF38', 'W032', 'MF15', 'MF08', 'F035', 'F033', 'W044', 'MF36', 'W011', 'W031'
Thus,
# Violation codes identified earlier as the most correlated (see the list
# above). The original dict literal repeated 'F037' — a silently-overriding
# duplicate key — and omitted 'MF45' and 'F033' from the stated list; building
# the aggregation spec from this list fixes both defects.
violation_codes = ['F037', 'F040', 'F007', 'F044', 'MF34', 'W007', 'W035',
                   'MF41', 'MF45', 'W017', 'W021', 'W034', 'MF31', 'W033',
                   'MF38', 'W032', 'MF15', 'MF08', 'F035', 'F033', 'W044',
                   'MF36', 'W011', 'W031']
# Mean of each violation indicator per (ZIP, facility); the string 'mean' is
# the idiomatic aggregation spec (np.mean as an agg func is deprecated).
Facility_Violations_df = Data.groupby(['Facility Zip', 'Facility ID']).agg(
    {code: 'mean' for code in violation_codes})
# Then average the per-facility means within each ZIP code.
Facility_Violations_df = Facility_Violations_df.groupby(level=0).mean()
Facility_Violations_df.reset_index(inplace=True)
Facility_Violations_df.head().style.hide_index()
Here, we only plot the following columns:
# Columns whose first row holds any truthy value (i.e. columns with data),
# skipping the leading 'Facility Zip' entry when printed.
temp = Facility_Violations_df[0:1].any().to_dict()
mylist = [k for k, v in temp.items() if v]  # truthiness test instead of `== True`
print(mylist[1:])
Therefore,
def _violation_map_title(vCode):
    # Legend text: the human-readable description for a violation code.
    description = Violation_Dictionary_df[
        Violation_Dictionary_df['Violation Code'] == vCode]['Violation Description'].values[0]
    # The original used '% s'; the space flag is a no-op for %s, so output is identical.
    return 'Average Violation: %s' % description

# One choropleth per highlighted violation code. This replaces five
# copy-pasted vCode / map_title / plot_map triples with a single loop.
for vCode in ['F037', 'F040', 'F007', 'F044', 'F035']:
    plot_map(vCode, _violation_map_title(vCode), Facility_Violations_df)
# Facility counts per (ZIP code, seat-capacity category). unstack(1) already
# returns a DataFrame, so the original pd.DataFrame(...) wrapper was a
# redundant copy and is dropped.
PE_Seats_Counts = Data.groupby(['Facility Zip', 'PE_Seats']).size().unstack(1).reset_index()
PE_Seats_Counts.head().style.hide_index()
We would like to plot only the following columns:
# Seat-capacity columns whose first row holds any truthy value, skipping
# the leading 'Facility Zip' entry when printed.
temp = PE_Seats_Counts[0:1].any().to_dict()
mylist = [k for k, v in temp.items() if v]  # truthiness test instead of `== True`
print(mylist[1:])
# One facility-count choropleth per seat-capacity category (deduplicates the
# original four near-identical plot_map calls).
for category in ['0-30 SQ. FT.', '1-1999 SQ. FT.', '151+ SQ. FT.', '31-60 SQ. FT.']:
    plot_map(category, 'Facility Total Count (%s)' % category, PE_Seats_Counts)
# Facility counts per (ZIP code, PE type). unstack(1) already returns a
# DataFrame, so the original pd.DataFrame(...) wrapper was a redundant copy.
PE_type_counts = Data.groupby(['Facility Zip', 'PE_Type']).size().unstack(1).reset_index()
PE_type_counts.head().style.hide_index()
We only plot the following columns.
# PE-type columns whose first row holds any truthy value, skipping the
# leading 'Facility Zip' entry when printed.
temp = PE_type_counts[0:1].any().to_dict()
mylist = [k for k, v in temp.items() if v]  # truthiness test instead of `== True`
print(mylist[1:])
# One facility-count choropleth per PE type of interest.
for pe_type in ['Food Market Retail', 'Restaurant']:
    plot_map(pe_type, 'PE Type: %s' % pe_type, PE_type_counts)
# Facility counts per (ZIP code, risk level). unstack(1) already returns a
# DataFrame, so the original pd.DataFrame(...) wrapper was a redundant copy.
PE_Risk_Counts = Data.groupby(['Facility Zip', 'PE_Risk']).size().unstack(1).reset_index()
PE_Risk_Counts.head().style.hide_index()
# One facility-count choropleth per risk level (deduplicates the original
# three near-identical plot_map calls).
for risk in ['Low Risk', 'Moderate Risk', 'High Risk']:
    plot_map(risk, 'PE Risk: %s' % risk, PE_Risk_Counts)